In [1]:
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as p
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, cut_tree
from ISLP.cluster import compute_linkage
from scipy.cluster.hierarchy import linkage
In [2]:
file_path = r"C:\Users\tiles\Downloads\causes-of-death-in-children-under-5.csv"
df = pd.read_csv(file_path)
df.head()
Out[2]:
Entity Code Year Deaths - Invasive Non-typhoidal Salmonella (iNTS) - Sex: Both - Age: Under 5 (Number) Deaths - Interpersonal violence - Sex: Both - Age: Under 5 (Number) Deaths - Nutritional deficiencies - Sex: Both - Age: Under 5 (Number) Deaths - Acute hepatitis - Sex: Both - Age: Under 5 (Number) Deaths - Neoplasms - Sex: Both - Age: Under 5 (Number) Deaths - Measles - Sex: Both - Age: Under 5 (Number) Deaths - Digestive diseases - Sex: Both - Age: Under 5 (Number) ... Deaths - Other neonatal disorders - Sex: Both - Age: Under 5 (Number) Deaths - Whooping cough - Sex: Both - Age: Under 5 (Number) Deaths - Diarrheal diseases - Sex: Both - Age: Under 5 (Number) Deaths - Fire, heat, and hot substances - Sex: Both - Age: Under 5 (Number) Deaths - Road injuries - Sex: Both - Age: Under 5 (Number) Deaths - Tuberculosis - Sex: Both - Age: Under 5 (Number) Deaths - HIV/AIDS - Sex: Both - Age: Under 5 (Number) Deaths - Drowning - Sex: Both - Age: Under 5 (Number) Deaths - Malaria - Sex: Both - Age: Under 5 (Number) Deaths - Syphilis - Sex: Both - Age: Under 5 (Number)
0 Afghanistan AFG 1990 48 105 1779 718 431 8649 477 ... 7112 2455 3968 131 802 808 10 776 21 123
1 Afghanistan AFG 1991 55 130 1822 741 439 8669 495 ... 7574 2385 4650 129 781 800 12 748 41 132
2 Afghanistan AFG 1992 68 155 2069 836 486 8539 554 ... 8614 2370 5833 137 821 863 13 777 51 180
3 Afghanistan AFG 1993 78 178 2427 970 549 8949 630 ... 9458 2659 7800 155 923 979 16 872 24 239
4 Afghanistan AFG 1994 83 194 2649 1063 589 10642 681 ... 9823 3187 7894 170 1015 1064 19 961 52 259

5 rows × 32 columns

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6840 entries, 0 to 6839
Data columns (total 32 columns):
 #   Column                                                                                                 Non-Null Count  Dtype 
---  ------                                                                                                 --------------  ----- 
 0   Entity                                                                                                 6840 non-null   object
 1   Code                                                                                                   6150 non-null   object
 2   Year                                                                                                   6840 non-null   int64 
 3   Deaths - Invasive Non-typhoidal Salmonella (iNTS) - Sex: Both - Age: Under 5 (Number)                  6840 non-null   int64 
 4   Deaths - Interpersonal violence - Sex: Both - Age: Under 5 (Number)                                    6840 non-null   int64 
 5   Deaths - Nutritional deficiencies - Sex: Both - Age: Under 5 (Number)                                  6840 non-null   int64 
 6   Deaths - Acute hepatitis - Sex: Both - Age: Under 5 (Number)                                           6840 non-null   int64 
 7   Deaths - Neoplasms - Sex: Both - Age: Under 5 (Number)                                                 6840 non-null   int64 
 8   Deaths - Measles - Sex: Both - Age: Under 5 (Number)                                                   6840 non-null   int64 
 9   Deaths - Digestive diseases - Sex: Both - Age: Under 5 (Number)                                        6840 non-null   int64 
 10  Deaths - Cirrhosis and other chronic liver diseases - Sex: Both - Age: Under 5 (Number)                6840 non-null   int64 
 11  Deaths - Chronic kidney disease - Sex: Both - Age: Under 5 (Number)                                    6840 non-null   int64 
 12  Deaths - Cardiovascular diseases - Sex: Both - Age: Under 5 (Number)                                   6840 non-null   int64 
 13  Deaths - Congenital birth defects - Sex: Both - Age: Under 5 (Number)                                  6840 non-null   int64 
 14  Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Number)                              6840 non-null   int64 
 15  Deaths - Neonatal preterm birth - Sex: Both - Age: Under 5 (Number)                                    6840 non-null   int64 
 16  Deaths - Environmental heat and cold exposure - Sex: Both - Age: Under 5 (Number)                      6840 non-null   int64 
 17  Deaths - Neonatal sepsis and other neonatal infections - Sex: Both - Age: Under 5 (Number)             6840 non-null   int64 
 18  Deaths - Exposure to forces of nature - Sex: Both - Age: Under 5 (Number)                              6840 non-null   int64 
 19  Deaths - Diabetes mellitus - Sex: Both - Age: Under 5 (Number)                                         6840 non-null   int64 
 20  Deaths - Neonatal encephalopathy due to birth asphyxia and trauma - Sex: Both - Age: Under 5 (Number)  6840 non-null   int64 
 21  Deaths - Meningitis - Sex: Both - Age: Under 5 (Number)                                                6840 non-null   int64 
 22  Deaths - Other neonatal disorders - Sex: Both - Age: Under 5 (Number)                                  6840 non-null   int64 
 23  Deaths - Whooping cough - Sex: Both - Age: Under 5 (Number)                                            6840 non-null   int64 
 24  Deaths - Diarrheal diseases - Sex: Both - Age: Under 5 (Number)                                        6840 non-null   int64 
 25  Deaths - Fire, heat, and hot substances - Sex: Both - Age: Under 5 (Number)                            6840 non-null   int64 
 26  Deaths - Road injuries - Sex: Both - Age: Under 5 (Number)                                             6840 non-null   int64 
 27  Deaths - Tuberculosis - Sex: Both - Age: Under 5 (Number)                                              6840 non-null   int64 
 28  Deaths - HIV/AIDS - Sex: Both - Age: Under 5 (Number)                                                  6840 non-null   int64 
 29  Deaths - Drowning - Sex: Both - Age: Under 5 (Number)                                                  6840 non-null   int64 
 30  Deaths - Malaria - Sex: Both - Age: Under 5 (Number)                                                   6840 non-null   int64 
 31  Deaths - Syphilis - Sex: Both - Age: Under 5 (Number)                                                  6840 non-null   int64 
dtypes: int64(30), object(2)
memory usage: 1.7+ MB
In [4]:
df.describe()
Out[4]:
Year Deaths - Invasive Non-typhoidal Salmonella (iNTS) - Sex: Both - Age: Under 5 (Number) Deaths - Interpersonal violence - Sex: Both - Age: Under 5 (Number) Deaths - Nutritional deficiencies - Sex: Both - Age: Under 5 (Number) Deaths - Acute hepatitis - Sex: Both - Age: Under 5 (Number) Deaths - Neoplasms - Sex: Both - Age: Under 5 (Number) Deaths - Measles - Sex: Both - Age: Under 5 (Number) Deaths - Digestive diseases - Sex: Both - Age: Under 5 (Number) Deaths - Cirrhosis and other chronic liver diseases - Sex: Both - Age: Under 5 (Number) Deaths - Chronic kidney disease - Sex: Both - Age: Under 5 (Number) ... Deaths - Other neonatal disorders - Sex: Both - Age: Under 5 (Number) Deaths - Whooping cough - Sex: Both - Age: Under 5 (Number) Deaths - Diarrheal diseases - Sex: Both - Age: Under 5 (Number) Deaths - Fire, heat, and hot substances - Sex: Both - Age: Under 5 (Number) Deaths - Road injuries - Sex: Both - Age: Under 5 (Number) Deaths - Tuberculosis - Sex: Both - Age: Under 5 (Number) Deaths - HIV/AIDS - Sex: Both - Age: Under 5 (Number) Deaths - Drowning - Sex: Both - Age: Under 5 (Number) Deaths - Malaria - Sex: Both - Age: Under 5 (Number) Deaths - Syphilis - Sex: Both - Age: Under 5 (Number)
count 6840.000000 6840.000000 6840.000000 6840.000000 6840.000000 6840.000000 6840.000000 6840.000000 6840.000000 6840.000000 ... 6840.000000 6840.000000 6.840000e+03 6840.000000 6840.000000 6840.000000 6840.000000 6840.000000 6840.000000 6840.000000
mean 2004.500000 1041.740789 399.418567 6392.636842 826.106433 1472.511550 8527.408772 1235.561696 262.530409 304.512135 ... 11726.084795 3876.111696 2.464864e+04 535.363743 1718.877047 2892.662719 3252.680702 2331.720468 12045.209064 2107.161111
std 8.656074 5943.506061 1549.064285 30815.191076 4399.035288 5794.139457 43502.336767 5006.538348 1119.050195 1208.144730 ... 51612.890640 16817.425576 1.133132e+05 2156.814008 6934.211045 13333.898943 18169.939174 10832.408381 64858.902628 9180.674890
min 1990.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1997.000000 0.000000 1.000000 1.000000 0.000000 7.000000 0.000000 2.000000 0.000000 1.000000 ... 18.750000 0.000000 3.000000e+00 2.000000 4.000000 0.000000 0.000000 3.000000 0.000000 0.000000
50% 2004.500000 1.000000 10.000000 17.000000 2.000000 44.000000 1.000000 25.000000 4.000000 7.000000 ... 178.000000 21.000000 9.800000e+01 15.000000 38.500000 11.000000 7.000000 27.000000 0.000000 12.000000
75% 2012.000000 42.000000 77.000000 903.250000 36.000000 283.000000 655.000000 290.250000 52.000000 78.000000 ... 1773.250000 671.000000 3.822250e+03 142.000000 359.250000 388.000000 298.250000 291.250000 217.250000 277.250000
max 2019.000000 62334.000000 21223.000000 524103.000000 50184.000000 85197.000000 704288.000000 77952.000000 15916.000000 18047.000000 ... 539952.000000 240021.000000 1.649581e+06 35583.000000 115624.000000 209562.000000 223680.000000 184096.000000 631523.000000 99248.000000

8 rows × 30 columns

In [5]:
df.isnull().sum()
Out[5]:
Entity                                                                                                     0
Code                                                                                                     690
Year                                                                                                       0
Deaths - Invasive Non-typhoidal Salmonella (iNTS) - Sex: Both - Age: Under 5 (Number)                      0
Deaths - Interpersonal violence - Sex: Both - Age: Under 5 (Number)                                        0
Deaths - Nutritional deficiencies - Sex: Both - Age: Under 5 (Number)                                      0
Deaths - Acute hepatitis - Sex: Both - Age: Under 5 (Number)                                               0
Deaths - Neoplasms - Sex: Both - Age: Under 5 (Number)                                                     0
Deaths - Measles - Sex: Both - Age: Under 5 (Number)                                                       0
Deaths - Digestive diseases - Sex: Both - Age: Under 5 (Number)                                            0
Deaths - Cirrhosis and other chronic liver diseases - Sex: Both - Age: Under 5 (Number)                    0
Deaths - Chronic kidney disease - Sex: Both - Age: Under 5 (Number)                                        0
Deaths - Cardiovascular diseases - Sex: Both - Age: Under 5 (Number)                                       0
Deaths - Congenital birth defects - Sex: Both - Age: Under 5 (Number)                                      0
Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Number)                                  0
Deaths - Neonatal preterm birth - Sex: Both - Age: Under 5 (Number)                                        0
Deaths - Environmental heat and cold exposure - Sex: Both - Age: Under 5 (Number)                          0
Deaths - Neonatal sepsis and other neonatal infections - Sex: Both - Age: Under 5 (Number)                 0
Deaths - Exposure to forces of nature - Sex: Both - Age: Under 5 (Number)                                  0
Deaths - Diabetes mellitus - Sex: Both - Age: Under 5 (Number)                                             0
Deaths - Neonatal encephalopathy due to birth asphyxia and trauma - Sex: Both - Age: Under 5 (Number)      0
Deaths - Meningitis - Sex: Both - Age: Under 5 (Number)                                                    0
Deaths - Other neonatal disorders - Sex: Both - Age: Under 5 (Number)                                      0
Deaths - Whooping cough - Sex: Both - Age: Under 5 (Number)                                                0
Deaths - Diarrheal diseases - Sex: Both - Age: Under 5 (Number)                                            0
Deaths - Fire, heat, and hot substances - Sex: Both - Age: Under 5 (Number)                                0
Deaths - Road injuries - Sex: Both - Age: Under 5 (Number)                                                 0
Deaths - Tuberculosis - Sex: Both - Age: Under 5 (Number)                                                  0
Deaths - HIV/AIDS - Sex: Both - Age: Under 5 (Number)                                                      0
Deaths - Drowning - Sex: Both - Age: Under 5 (Number)                                                      0
Deaths - Malaria - Sex: Both - Age: Under 5 (Number)                                                       0
Deaths - Syphilis - Sex: Both - Age: Under 5 (Number)                                                      0
dtype: int64
In [6]:
missing_code_entities_unique = df.loc[df['Code'].isnull(), 'Entity'].unique()
missing_code_entities_unique
Out[6]:
array(['African Region (WHO)', 'East Asia & Pacific (WB)',
       'Eastern Mediterranean Region (WHO)', 'England',
       'Europe & Central Asia (WB)', 'European Region (WHO)', 'G20',
       'Latin America & Caribbean (WB)',
       'Middle East & North Africa (WB)', 'North America (WB)',
       'Northern Ireland', 'OECD Countries',
       'Region of the Americas (WHO)', 'Scotland', 'South Asia (WB)',
       'South-East Asia Region (WHO)', 'Sub-Saharan Africa (WB)', 'Wales',
       'Western Pacific Region (WHO)', 'World Bank High Income',
       'World Bank Low Income', 'World Bank Lower Middle Income',
       'World Bank Upper Middle Income'], dtype=object)
In [7]:
df = df.dropna()
In [8]:
indices_to_drop = df[df['Entity'] == 'World'].index
df = df.drop(indices_to_drop)
In [9]:
df
Out[9]:
Entity Code Year Deaths - Invasive Non-typhoidal Salmonella (iNTS) - Sex: Both - Age: Under 5 (Number) Deaths - Interpersonal violence - Sex: Both - Age: Under 5 (Number) Deaths - Nutritional deficiencies - Sex: Both - Age: Under 5 (Number) Deaths - Acute hepatitis - Sex: Both - Age: Under 5 (Number) Deaths - Neoplasms - Sex: Both - Age: Under 5 (Number) Deaths - Measles - Sex: Both - Age: Under 5 (Number) Deaths - Digestive diseases - Sex: Both - Age: Under 5 (Number) ... Deaths - Other neonatal disorders - Sex: Both - Age: Under 5 (Number) Deaths - Whooping cough - Sex: Both - Age: Under 5 (Number) Deaths - Diarrheal diseases - Sex: Both - Age: Under 5 (Number) Deaths - Fire, heat, and hot substances - Sex: Both - Age: Under 5 (Number) Deaths - Road injuries - Sex: Both - Age: Under 5 (Number) Deaths - Tuberculosis - Sex: Both - Age: Under 5 (Number) Deaths - HIV/AIDS - Sex: Both - Age: Under 5 (Number) Deaths - Drowning - Sex: Both - Age: Under 5 (Number) Deaths - Malaria - Sex: Both - Age: Under 5 (Number) Deaths - Syphilis - Sex: Both - Age: Under 5 (Number)
0 Afghanistan AFG 1990 48 105 1779 718 431 8649 477 ... 7112 2455 3968 131 802 808 10 776 21 123
1 Afghanistan AFG 1991 55 130 1822 741 439 8669 495 ... 7574 2385 4650 129 781 800 12 748 41 132
2 Afghanistan AFG 1992 68 155 2069 836 486 8539 554 ... 8614 2370 5833 137 821 863 13 777 51 180
3 Afghanistan AFG 1993 78 178 2427 970 549 8949 630 ... 9458 2659 7800 155 923 979 16 872 24 239
4 Afghanistan AFG 1994 83 194 2649 1063 589 10642 681 ... 9823 3187 7894 170 1015 1064 19 961 52 259
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6835 Zimbabwe ZWE 2015 106 31 1733 17 56 615 92 ... 2269 518 1345 114 115 799 2178 126 1475 399
6836 Zimbabwe ZWE 2016 112 32 1771 18 58 369 95 ... 2249 559 1286 119 120 787 1827 133 1219 398
6837 Zimbabwe ZWE 2017 111 32 1714 17 58 261 94 ... 2245 544 1248 117 119 745 1658 133 1249 394
6838 Zimbabwe ZWE 2018 109 31 1639 16 58 340 91 ... 2203 568 1136 114 115 693 1458 129 1213 397
6839 Zimbabwe ZWE 2019 108 31 1598 15 57 349 89 ... 2190 536 1067 112 112 661 1394 127 1207 413

6120 rows × 32 columns

In [10]:
rename_dict = {
    'Deaths - Invasive Non-typhoidal Salmonella (iNTS) - Sex: Both - Age: Under 5 (Number)': 'INTS_Deaths',
    'Deaths - Interpersonal violence - Sex: Both - Age: Under 5 (Number)': 'Violence_Deaths',
    'Deaths - Nutritional deficiencies - Sex: Both - Age: Under 5 (Number)': 'Nutrition_Deaths',
    'Deaths - Acute hepatitis - Sex: Both - Age: Under 5 (Number)': 'Hepatitis_Deaths',
    'Deaths - Neoplasms - Sex: Both - Age: Under 5 (Number)': 'Neoplasms_Deaths',
    'Deaths - Measles - Sex: Both - Age: Under 5 (Number)': 'Measles_Deaths',
    'Deaths - Digestive diseases - Sex: Both - Age: Under 5 (Number)': 'Digestive_Deaths',
    'Deaths - Cirrhosis and other chronic liver diseases - Sex: Both - Age: Under 5 (Number)': 'Cirrhosis_Deaths',
    'Deaths - Chronic kidney disease - Sex: Both - Age: Under 5 (Number)': 'Kidney_Deaths',
    'Deaths - Cardiovascular diseases - Sex: Both - Age: Under 5 (Number)': 'Cardiovascular_Deaths',
    'Deaths - Congenital birth defects - Sex: Both - Age: Under 5 (Number)': 'Congenital_Deaths',
    'Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Number)': 'Respiratory_Deaths',
    'Deaths - Neonatal preterm birth - Sex: Both - Age: Under 5 (Number)': 'Preterm_Deaths',
    'Deaths - Environmental heat and cold exposure - Sex: Both - Age: Under 5 (Number)': 'Heat_Cold_Deaths',
    'Deaths - Neonatal sepsis and other neonatal infections - Sex: Both - Age: Under 5 (Number)': 'Sepsis_Deaths',
    'Deaths - Exposure to forces of nature - Sex: Both - Age: Under 5 (Number)': 'Nature_Deaths',
    'Deaths - Diabetes mellitus - Sex: Both - Age: Under 5 (Number)': 'Diabetes_Deaths',
    'Deaths - Neonatal encephalopathy due to birth asphyxia and trauma - Sex: Both - Age: Under 5 (Number)': 'Encephalopathy_Deaths',
    'Deaths - Meningitis - Sex: Both - Age: Under 5 (Number)': 'Meningitis_Deaths',
    'Deaths - Other neonatal disorders - Sex: Both - Age: Under 5 (Number)': 'Other_Neonatal_Deaths',
    'Deaths - Whooping cough - Sex: Both - Age: Under 5 (Number)': 'Whooping_Cough_Deaths',
    'Deaths - Diarrheal diseases - Sex: Both - Age: Under 5 (Number)': 'Diarrheal_Deaths',
    'Deaths - Fire, heat, and hot substances - Sex: Both - Age: Under 5 (Number)': 'Fire_Heat_Deaths',
    'Deaths - Road injuries - Sex: Both - Age: Under 5 (Number)': 'Road_Deaths',
    'Deaths - Tuberculosis - Sex: Both - Age: Under 5 (Number)': 'Tuberculosis_Deaths',
    'Deaths - HIV/AIDS - Sex: Both - Age: Under 5 (Number)': 'HIV_AIDS_Deaths',
    'Deaths - Drowning - Sex: Both - Age: Under 5 (Number)': 'Drowning_Deaths',
    'Deaths - Malaria - Sex: Both - Age: Under 5 (Number)': 'Malaria_Deaths',
    'Deaths - Syphilis - Sex: Both - Age: Under 5 (Number)': 'Syphilis_Deaths'
}

df = df.rename(columns=rename_dict)
print(df.columns)
Index(['Entity', 'Code', 'Year', 'INTS_Deaths', 'Violence_Deaths',
       'Nutrition_Deaths', 'Hepatitis_Deaths', 'Neoplasms_Deaths',
       'Measles_Deaths', 'Digestive_Deaths', 'Cirrhosis_Deaths',
       'Kidney_Deaths', 'Cardiovascular_Deaths', 'Congenital_Deaths',
       'Respiratory_Deaths', 'Preterm_Deaths', 'Heat_Cold_Deaths',
       'Sepsis_Deaths', 'Nature_Deaths', 'Diabetes_Deaths',
       'Encephalopathy_Deaths', 'Meningitis_Deaths', 'Other_Neonatal_Deaths',
       'Whooping_Cough_Deaths', 'Diarrheal_Deaths', 'Fire_Heat_Deaths',
       'Road_Deaths', 'Tuberculosis_Deaths', 'HIV_AIDS_Deaths',
       'Drowning_Deaths', 'Malaria_Deaths', 'Syphilis_Deaths'],
      dtype='object')
In [11]:
cols = df.drop(columns=['Entity', 'Code', 'Year']).select_dtypes(include=np.number)
z_scores = cols.apply(zscore)
outlier_threshold = 3
outliers = (z_scores > outlier_threshold) | (z_scores < -outlier_threshold)
outlier_counts = outliers.sum()
print("Columns with outlier counts:")
print(outlier_counts[outlier_counts > 0])
Columns with outlier counts:
INTS_Deaths               38
Violence_Deaths           78
Nutrition_Deaths          55
Hepatitis_Deaths          33
Neoplasms_Deaths          70
Measles_Deaths            74
Digestive_Deaths         102
Cirrhosis_Deaths          78
Kidney_Deaths            141
Cardiovascular_Deaths     96
Congenital_Deaths         85
Respiratory_Deaths        73
Preterm_Deaths            58
Heat_Cold_Deaths          43
Sepsis_Deaths             75
Nature_Deaths             13
Diabetes_Deaths          177
Encephalopathy_Deaths    106
Meningitis_Deaths         97
Other_Neonatal_Deaths     48
Whooping_Cough_Deaths     83
Diarrheal_Deaths          57
Fire_Heat_Deaths          81
Road_Deaths              133
Tuberculosis_Deaths      125
HIV_AIDS_Deaths          165
Drowning_Deaths           68
Malaria_Deaths            82
Syphilis_Deaths          136
dtype: int64
In [12]:
filtered_data = df[(df['Year'] >= 1990) & (df['Year'] <= 2019)]
aggregated_data = filtered_data.groupby('Entity').sum()
aggregated_data = aggregated_data.drop(columns=['Year'])
aggregated_data
C:\Users\tiles\AppData\Local\Temp\ipykernel_24540\2972503105.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  aggregated_data = filtered_data.groupby('Entity').sum()
Out[12]:
INTS_Deaths Violence_Deaths Nutrition_Deaths Hepatitis_Deaths Neoplasms_Deaths Measles_Deaths Digestive_Deaths Cirrhosis_Deaths Kidney_Deaths Cardiovascular_Deaths ... Other_Neonatal_Deaths Whooping_Cough_Deaths Diarrheal_Deaths Fire_Heat_Deaths Road_Deaths Tuberculosis_Deaths HIV_AIDS_Deaths Drowning_Deaths Malaria_Deaths Syphilis_Deaths
Entity
Afghanistan 4355 6307 58382 23437 17974 199342 20764 7046 8904 8038 ... 322226 107240 236890 4528 26172 23411 1185 25157 3310 8450
Albania 0 85 457 13 798 196 1357 127 143 1073 ... 7584 258 505 142 324 29 0 426 0 150
Algeria 422 392 3812 1110 4393 20391 5265 1884 2848 10669 ... 78084 8150 26856 3659 26803 959 1158 5173 0 4200
American Samoa 0 0 17 0 0 33 0 0 0 0 ... 46 22 4 0 0 0 0 0 0 55
Andorra 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Venezuela 0 2990 12437 129 4801 62 3363 439 716 1058 ... 9828 686 38038 756 5376 755 1534 3369 523 432
Vietnam 928 1055 9295 667 11434 84565 10139 1777 2849 12965 ... 27792 33658 38757 3021 9956 10835 2072 65895 1645 9328
Yemen 1924 1201 52547 6388 8405 100086 10464 3756 3108 19717 ... 252675 32201 397881 7836 54648 5338 1759 12282 7081 10606
Zambia 1503 2487 68820 1015 13625 57746 8670 1752 2269 5779 ... 59476 27087 230238 4001 8373 25753 216174 6805 124161 38809
Zimbabwe 2005 771 33936 334 1183 28844 2051 194 531 3908 ... 62915 12635 47178 2267 2200 17075 236368 2297 56942 11729

204 rows × 29 columns

In [13]:
total_deaths_per_entity = df.groupby('Entity').sum().sum(axis=1)
top_50_countries = total_deaths_per_entity.sort_values(ascending=False).head(50)

colors = plt.cm.cool(np.linspace(0, 1, len(top_50_countries)))

plt.figure(figsize=(12, 8))
top_50_countries.plot(kind='bar', color=colors)
plt.title('Top 50 Countries by Total Number of Deaths')
plt.xlabel('Country')
plt.ylabel('Total Number of Deaths')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
C:\Users\tiles\AppData\Local\Temp\ipykernel_24540\11496973.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  total_deaths_per_entity = df.groupby('Entity').sum().sum(axis=1)
No description has been provided for this image
In [14]:
causes_of_death = df.drop(columns=['Entity', 'Code', 'Year']).sum()
top_20_causes = causes_of_death.sort_values(ascending=False).head(20)

colors = plt.cm.cool(np.linspace(0, 1, len(top_20_causes)))

plt.figure(figsize=(12, 8))
top_20_causes.plot(kind='bar', color=colors)
plt.title('Top 20 Causes of Death')
plt.xlabel('Cause of Death')
plt.ylabel('Total Number of Deaths')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [15]:
df_map = df.copy()

df_map['Total_Deaths'] = df_map.drop(columns=['Entity', 'Code', 'Year']).sum(axis=1)
yearly_deaths = df_map.groupby(['Entity', 'Year'])['Total_Deaths'].sum().reset_index()

fig = px.choropleth(yearly_deaths,
                    locations="Entity",
                    locationmode='country names',
                    color="Total_Deaths",
                    hover_name="Entity",
                    animation_frame="Year",
                    color_continuous_scale='Reds',
                    title="Total Deaths from 1990 to 2019")

fig.update_layout(
    geo=dict(showframe=False, showcoastlines=False, projection_type='equirectangular'),
    title=dict(x=0.5)
)

fig.show()
In [16]:
import matplotlib.pyplot as plt

df_cause = df.drop(columns=['Entity', 'Code'])
df_yearly = df_cause.groupby('Year').sum().reset_index()

if 'Total_Deaths' in df_yearly.columns:
    df_yearly = df_yearly.drop(columns='Total_Deaths')

num_causes = len(df_yearly.columns) - 1  
colors = plt.cm.tab10(np.linspace(0, 1, num_causes))

plt.figure(figsize=(12, 8))
for i, c in enumerate(df_yearly.columns[1:], start=1):
    plt.plot(df_yearly['Year'], df_yearly[c], label=c, color=colors[i % num_causes])

plt.title('Deaths by Disease from 1990 to 2019')
plt.xlabel('Year')
plt.ylabel('Number of Deaths')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1)
plt.tight_layout()
plt.show()
No description has been provided for this image

Models¶

In [17]:
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df.drop(columns=['Entity', 'Code', 'Year']))
pca = PCA()
pca_out = pca.fit_transform(df_scaled)
In [18]:
print(pd.DataFrame({'Center': scaler.mean_, 'Scale': scaler.scale_}, index=df.columns.drop(['Entity', 'Code', 'Year'])))
                            Center         Scale
INTS_Deaths             231.492320   1622.368214
Violence_Deaths          79.825817    303.457690
Nutrition_Deaths       1333.719118   6972.497881
Hepatitis_Deaths        160.559967   1616.560333
Neoplasms_Deaths        297.492974   1204.333079
Measles_Deaths         1814.687092   8421.696247
Digestive_Deaths        253.901797    938.604887
Cirrhosis_Deaths         54.085621    238.815144
Kidney_Deaths            63.699020    180.199557
Cardiovascular_Deaths   236.582190    898.611133
Congenital_Deaths      3175.634477  10591.726971
Respiratory_Deaths     6722.154902  30161.351947
Preterm_Deaths         4808.332026  20841.947352
Heat_Cold_Deaths         17.039706    123.145252
Sepsis_Deaths          1271.867157   4650.791920
Nature_Deaths            24.220098    505.365350
Diabetes_Deaths          16.040850     44.805750
Encephalopathy_Deaths  3647.752614  15066.581181
Meningitis_Deaths      1019.708007   4083.534715
Other_Neonatal_Deaths  2374.598366  14282.461832
Whooping_Cough_Deaths   807.175163   3630.187080
Diarrheal_Deaths       5213.969771  22619.833294
Fire_Heat_Deaths        111.063399    358.777053
Road_Deaths             352.145588   1244.571219
Tuberculosis_Deaths     611.118464   2500.691503
HIV_AIDS_Deaths         707.966013   2634.357974
Drowning_Deaths         469.680556   2781.121195
Malaria_Deaths         2654.809150  12169.034911
Syphilis_Deaths         448.185294   1475.692709
In [19]:
print("Number of Principal Components:", pca.n_components_)
Number of Principal Components: 29
In [20]:
components_df = pd.DataFrame(pca.components_.T, index=df.columns.drop(['Entity', 'Code', 'Year']), columns=[f'PC{i+1}' for i in range(pca.n_components_)])
print(components_df)
                            PC1       PC2       PC3       PC4       PC5  \
INTS_Deaths            0.094227 -0.279744  0.463189 -0.097988 -0.311449   
Violence_Deaths        0.192083  0.222668  0.146339 -0.055789 -0.035394   
Nutrition_Deaths       0.185846 -0.098552 -0.229165  0.030813  0.090948   
Hepatitis_Deaths       0.181262 -0.071792 -0.355795 -0.059059 -0.015808   
Neoplasms_Deaths       0.189168  0.308176  0.059227 -0.001661  0.086901   
Measles_Deaths         0.190922 -0.195315 -0.058128  0.001188 -0.004194   
Digestive_Deaths       0.204316  0.154755  0.155294 -0.032092 -0.091682   
Cirrhosis_Deaths       0.207761 -0.142255 -0.120225 -0.020686 -0.116311   
Kidney_Deaths          0.203901  0.060996  0.144088  0.068227 -0.049469   
Cardiovascular_Deaths  0.153002  0.310854  0.170639 -0.037460 -0.036617   
Congenital_Deaths      0.210769  0.182265  0.024624 -0.018932 -0.016313   
Respiratory_Deaths     0.221859  0.003205 -0.048193 -0.041515 -0.054279   
Preterm_Deaths         0.212775 -0.004188 -0.175389 -0.018307 -0.026728   
Heat_Cold_Deaths       0.201447  0.150567 -0.138287 -0.088340 -0.017715   
Sepsis_Deaths          0.204937 -0.181966 -0.100790  0.021038  0.033891   
Nature_Deaths          0.021530  0.025108 -0.033360  0.933804 -0.320722   
Diabetes_Deaths        0.151425  0.185858  0.211426  0.150184  0.291552   
Encephalopathy_Deaths  0.205653 -0.058167 -0.058967  0.010093  0.001525   
Meningitis_Deaths      0.200081 -0.191567  0.151517 -0.053601 -0.146360   
Other_Neonatal_Deaths  0.186166 -0.095823 -0.326120 -0.005729  0.037272   
Whooping_Cough_Deaths  0.216636 -0.095771 -0.121720 -0.004722  0.002849   
Diarrheal_Deaths       0.204768 -0.223387  0.033048 -0.042289 -0.123416   
Fire_Heat_Deaths       0.218313  0.060347  0.051362 -0.028369 -0.029486   
Road_Deaths            0.182034  0.251374  0.167682 -0.015125 -0.015366   
Tuberculosis_Deaths    0.205832 -0.118289 -0.014162  0.037863  0.044409   
HIV_AIDS_Deaths        0.067126 -0.195497  0.223282  0.218046  0.743423   
Drowning_Deaths        0.178385  0.319090 -0.047663  0.013700  0.006005   
Malaria_Deaths         0.129861 -0.318591  0.386044 -0.031780 -0.070264   
Syphilis_Deaths        0.193933 -0.157282 -0.001519  0.118707  0.260574   

                            PC6       PC7       PC8       PC9      PC10  ...  \
INTS_Deaths           -0.150491 -0.160460 -0.168650  0.019196  0.068765  ...   
Violence_Deaths       -0.211198 -0.073727 -0.162915 -0.251523  0.142197  ...   
Nutrition_Deaths       0.162417  0.419810 -0.199878 -0.001383  0.167245  ...   
Hepatitis_Deaths      -0.181882 -0.073030 -0.109682  0.225578  0.130812  ...   
Neoplasms_Deaths      -0.085367  0.004181  0.014093 -0.241094 -0.110874  ...   
Measles_Deaths         0.131766  0.354851  0.323575 -0.326737  0.191139  ...   
Digestive_Deaths      -0.102466 -0.013309 -0.087361 -0.031306 -0.249631  ...   
Cirrhosis_Deaths      -0.004083 -0.084571 -0.035769  0.175154 -0.093034  ...   
Kidney_Deaths          0.238568 -0.130608  0.174932 -0.108956 -0.115535  ...   
Cardiovascular_Deaths -0.069920  0.144762  0.567575  0.467288  0.356480  ...   
Congenital_Deaths     -0.090481 -0.163793 -0.011526  0.023456 -0.043899  ...   
Respiratory_Deaths    -0.076192  0.086761 -0.050587 -0.051147  0.123885  ...   
Preterm_Deaths        -0.084886 -0.221464  0.010628  0.092853 -0.044653  ...   
Heat_Cold_Deaths      -0.275361  0.125037 -0.239183  0.012007  0.136205  ...   
Sepsis_Deaths          0.034137 -0.287823 -0.004858  0.011011  0.111145  ...   
Nature_Deaths         -0.124072  0.033323 -0.017946  0.017272  0.030310  ...   
Diabetes_Deaths        0.524986 -0.306077 -0.212995  0.026004  0.468122  ...   
Encephalopathy_Deaths  0.027057 -0.265309  0.344594 -0.302319 -0.190070  ...   
Meningitis_Deaths     -0.045498 -0.004897  0.197662 -0.228545  0.005492  ...   
Other_Neonatal_Deaths -0.070486 -0.265279  0.060426  0.175507  0.013000  ...   
Whooping_Cough_Deaths -0.000840  0.023613 -0.063161  0.040207  0.011448  ...   
Diarrheal_Deaths      -0.003502  0.130329  0.078196 -0.001123  0.121607  ...   
Fire_Heat_Deaths      -0.076153  0.056770 -0.007674 -0.077277  0.032710  ...   
Road_Deaths            0.066890  0.188621  0.020785  0.382679 -0.417345  ...   
Tuberculosis_Deaths    0.300646  0.298881 -0.041852  0.027531 -0.235244  ...   
HIV_AIDS_Deaths       -0.472547  0.074565  0.070075 -0.014049 -0.019501  ...   
Drowning_Deaths        0.021078  0.156610 -0.273636 -0.205253 -0.059690  ...   
Malaria_Deaths         0.005307  0.110629 -0.258285  0.230162  0.052138  ...   
Syphilis_Deaths        0.234327 -0.102497 -0.021082  0.145453 -0.333198  ...   

                           PC20      PC21      PC22      PC23      PC24  \
INTS_Deaths            0.097433  0.001448 -0.074810 -0.094545 -0.097994   
Violence_Deaths       -0.173820 -0.033710 -0.050523  0.083480  0.103140   
Nutrition_Deaths      -0.127458 -0.044478  0.009286  0.094013 -0.186105   
Hepatitis_Deaths      -0.236619 -0.216256 -0.077713  0.303019  0.177520   
Neoplasms_Deaths      -0.092602  0.080114 -0.037015 -0.206970  0.142890   
Measles_Deaths         0.053259 -0.115713 -0.123097 -0.015506 -0.142276   
Digestive_Deaths      -0.055170 -0.379395  0.051284 -0.150414 -0.219953   
Cirrhosis_Deaths      -0.092435 -0.357556 -0.195848 -0.003489  0.239556   
Kidney_Deaths         -0.321186  0.343685 -0.127534 -0.012511 -0.199787   
Cardiovascular_Deaths  0.055162 -0.101249  0.098490 -0.050023  0.073617   
Congenital_Deaths     -0.166603  0.104662  0.302517 -0.228667  0.095590   
Respiratory_Deaths    -0.110788  0.086398 -0.053321 -0.144675 -0.227351   
Preterm_Deaths         0.108676 -0.137703  0.254089 -0.071234 -0.148213   
Heat_Cold_Deaths       0.035002  0.207158 -0.148331 -0.029819 -0.384965   
Sepsis_Deaths          0.647155  0.229967 -0.002104  0.026460 -0.036458   
Nature_Deaths          0.001507 -0.001788  0.001029  0.002632 -0.003212   
Diabetes_Deaths       -0.042636 -0.117215 -0.058150  0.066593 -0.010476   
Encephalopathy_Deaths  0.009330 -0.222256 -0.415252  0.056801  0.066756   
Meningitis_Deaths     -0.115091  0.092793  0.559466  0.541645 -0.002258   
Other_Neonatal_Deaths -0.085579  0.118892  0.098032 -0.148898 -0.244146   
Whooping_Cough_Deaths -0.121116  0.456805 -0.059503 -0.087762  0.564668   
Diarrheal_Deaths       0.047733  0.118182 -0.224383 -0.040080  0.085454   
Fire_Heat_Deaths       0.321166 -0.138664  0.057472 -0.110696  0.131770   
Road_Deaths            0.150643  0.216489 -0.267263  0.347573 -0.100635   
Tuberculosis_Deaths    0.081210 -0.069895  0.275319 -0.421664  0.135779   
HIV_AIDS_Deaths        0.004064 -0.001969 -0.019463  0.000040  0.015408   
Drowning_Deaths        0.311309 -0.091829  0.079973  0.269858  0.182744   
Malaria_Deaths        -0.119786 -0.071422 -0.039554 -0.015174  0.069500   
Syphilis_Deaths       -0.061982 -0.062689  0.100664  0.136883 -0.121650   

                           PC25      PC26      PC27      PC28      PC29  
INTS_Deaths           -0.088373  0.044640 -0.002440 -0.108274 -0.045844  
Violence_Deaths        0.057892  0.050430  0.056767  0.021887  0.028849  
Nutrition_Deaths       0.060651 -0.031770 -0.038811 -0.111907 -0.029652  
Hepatitis_Deaths      -0.400324 -0.138143 -0.279816 -0.366895  0.054902  
Neoplasms_Deaths      -0.388595  0.351079  0.316007 -0.234096  0.146334  
Measles_Deaths         0.086526  0.055086  0.052071  0.008244 -0.066508  
Digestive_Deaths       0.307334  0.024007 -0.262471 -0.292995 -0.095235  
Cirrhosis_Deaths      -0.147981  0.235191  0.326707  0.486375 -0.074721  
Kidney_Deaths         -0.228333 -0.123653  0.026343 -0.201089 -0.040781  
Cardiovascular_Deaths -0.002784 -0.016565  0.019231  0.005018 -0.027578  
Congenital_Deaths     -0.059441 -0.270984 -0.333200  0.332474 -0.250917  
Respiratory_Deaths     0.009130  0.014471 -0.242095  0.348853  0.754397  
Preterm_Deaths         0.220018 -0.408043  0.585638 -0.214154  0.257328  
Heat_Cold_Deaths      -0.105954 -0.203076  0.198300  0.237287 -0.404382  
Sepsis_Deaths         -0.236459 -0.000124 -0.138853 -0.011869  0.045211  
Nature_Deaths         -0.004240  0.002690 -0.001426  0.004114 -0.000293  
Diabetes_Deaths        0.077620  0.030078  0.022936  0.017752 -0.040112  
Encephalopathy_Deaths  0.131748 -0.218342 -0.136616  0.111867 -0.068003  
Meningitis_Deaths     -0.029798  0.122174  0.048276  0.108004 -0.035973  
Other_Neonatal_Deaths  0.265599  0.635336 -0.080919 -0.078857 -0.164563  
Whooping_Cough_Deaths  0.440254 -0.084923  0.022981 -0.074209  0.032438  
Diarrheal_Deaths       0.022970 -0.035097  0.047334 -0.171974 -0.151255  
Fire_Heat_Deaths      -0.004262  0.065565 -0.176340 -0.107244  0.071756  
Road_Deaths            0.066116  0.072285 -0.019305  0.017310  0.088178  
Tuberculosis_Deaths   -0.252701 -0.057748 -0.033566 -0.002233 -0.090820  
HIV_AIDS_Deaths       -0.001889 -0.011943 -0.000415  0.002573 -0.014906  
Drowning_Deaths        0.118622  0.073175 -0.012029  0.047145 -0.063032  
Malaria_Deaths         0.074032  0.002485  0.052472  0.004714  0.040699  
Syphilis_Deaths       -0.041984 -0.047687  0.012801  0.092254  0.028923  

[29 rows x 29 columns]
In [24]:
plt.figure(figsize=(14, 10))

for i in range(28): 
    plt.scatter(pca_out[:, i], pca_out[:, i + 1], label=f'PC{i+1} vs PC{i+2}', s=5)

plt.xlabel('Principal Component')
plt.ylabel('Principal Component')
plt.title('Scatter Plot of the First 29 Principal Components')
plt.legend()
plt.show()
No description has been provided for this image
In [25]:
print("Explained Variance:", pca.explained_variance_)
Explained Variance: [1.97196643e+01 2.45048380e+00 2.19999567e+00 1.00752567e+00
 9.58763219e-01 5.18799377e-01 4.43362442e-01 3.55857051e-01
 3.02347592e-01 2.21220670e-01 2.04948758e-01 1.25738712e-01
 1.04572661e-01 8.55214733e-02 5.88939451e-02 4.80105053e-02
 4.28070082e-02 3.57672799e-02 2.20169171e-02 2.13706380e-02
 1.66299190e-02 1.26423574e-02 1.15981840e-02 9.93009277e-03
 7.62041773e-03 5.67412152e-03 5.28917665e-03 4.89361606e-03
 2.79377675e-03]
In [26]:
print("Explained Variance Ratio:", pca.explained_variance_ratio_)
Explained Variance Ratio: [6.79877314e-01 8.44856343e-02 7.58495239e-02 3.47365877e-02
 3.30553985e-02 1.78867106e-02 1.52858620e-02 1.22689277e-02
 1.04240755e-02 7.62705251e-03 7.06604378e-03 4.33510917e-03
 3.60536462e-03 2.94853446e-03 2.03049386e-03 1.65526415e-03
 1.47586254e-03 1.23315295e-03 7.59079984e-04 7.36798139e-04
 5.73351781e-04 4.35872127e-04 3.99872030e-04 3.42361042e-04
 2.62730088e-04 1.95627392e-04 1.82355600e-04 1.68717809e-04
 9.63213881e-05]
In [27]:
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_, marker='o')
plt.xlabel('Principal Component')
plt.ylabel('Proportion of Variance Explained')
plt.title('Proportion of Variance Explained by Principal Components')
plt.grid(True)
plt.show()
No description has been provided for this image
In [28]:
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(cumulative_explained_variance) + 1), cumulative_explained_variance, marker='o')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Proportion of Variance Explained')
plt.title('Cumulative Proportion of Variance Explained by Principal Components')
plt.grid(True)
plt.show()
No description has been provided for this image
In [29]:
pca = PCA(n_components=7)
pca.fit(df_scaled)
components_df = pd.DataFrame(pca.components_.T, index=df.columns.drop(['Entity', 'Code', 'Year']), columns=[f'PC{i+1}' for i in range(pca.n_components_)])
print(components_df)
pca_out = pca.transform(df_scaled)
                            PC1       PC2       PC3       PC4       PC5  \
INTS_Deaths            0.094227 -0.279744  0.463189 -0.097988 -0.311449   
Violence_Deaths        0.192083  0.222668  0.146339 -0.055789 -0.035394   
Nutrition_Deaths       0.185846 -0.098552 -0.229165  0.030813  0.090948   
Hepatitis_Deaths       0.181262 -0.071792 -0.355795 -0.059059 -0.015808   
Neoplasms_Deaths       0.189168  0.308176  0.059227 -0.001661  0.086901   
Measles_Deaths         0.190922 -0.195315 -0.058128  0.001188 -0.004194   
Digestive_Deaths       0.204316  0.154755  0.155294 -0.032092 -0.091682   
Cirrhosis_Deaths       0.207761 -0.142255 -0.120225 -0.020686 -0.116311   
Kidney_Deaths          0.203901  0.060996  0.144088  0.068227 -0.049469   
Cardiovascular_Deaths  0.153002  0.310854  0.170639 -0.037460 -0.036617   
Congenital_Deaths      0.210769  0.182265  0.024624 -0.018932 -0.016313   
Respiratory_Deaths     0.221859  0.003205 -0.048193 -0.041515 -0.054279   
Preterm_Deaths         0.212775 -0.004188 -0.175389 -0.018307 -0.026728   
Heat_Cold_Deaths       0.201447  0.150567 -0.138287 -0.088340 -0.017715   
Sepsis_Deaths          0.204937 -0.181966 -0.100790  0.021038  0.033891   
Nature_Deaths          0.021530  0.025108 -0.033360  0.933804 -0.320722   
Diabetes_Deaths        0.151425  0.185858  0.211426  0.150184  0.291552   
Encephalopathy_Deaths  0.205653 -0.058167 -0.058967  0.010093  0.001525   
Meningitis_Deaths      0.200081 -0.191567  0.151517 -0.053601 -0.146360   
Other_Neonatal_Deaths  0.186166 -0.095823 -0.326120 -0.005729  0.037272   
Whooping_Cough_Deaths  0.216636 -0.095771 -0.121720 -0.004722  0.002849   
Diarrheal_Deaths       0.204768 -0.223387  0.033048 -0.042289 -0.123416   
Fire_Heat_Deaths       0.218313  0.060347  0.051362 -0.028369 -0.029486   
Road_Deaths            0.182034  0.251374  0.167682 -0.015125 -0.015366   
Tuberculosis_Deaths    0.205832 -0.118289 -0.014162  0.037863  0.044409   
HIV_AIDS_Deaths        0.067126 -0.195497  0.223282  0.218046  0.743423   
Drowning_Deaths        0.178385  0.319090 -0.047663  0.013700  0.006005   
Malaria_Deaths         0.129861 -0.318591  0.386044 -0.031780 -0.070264   
Syphilis_Deaths        0.193933 -0.157282 -0.001519  0.118707  0.260574   

                            PC6       PC7  
INTS_Deaths           -0.150491 -0.160460  
Violence_Deaths       -0.211198 -0.073727  
Nutrition_Deaths       0.162417  0.419810  
Hepatitis_Deaths      -0.181882 -0.073030  
Neoplasms_Deaths      -0.085367  0.004181  
Measles_Deaths         0.131766  0.354851  
Digestive_Deaths      -0.102466 -0.013309  
Cirrhosis_Deaths      -0.004083 -0.084571  
Kidney_Deaths          0.238568 -0.130608  
Cardiovascular_Deaths -0.069920  0.144762  
Congenital_Deaths     -0.090481 -0.163793  
Respiratory_Deaths    -0.076192  0.086761  
Preterm_Deaths        -0.084886 -0.221464  
Heat_Cold_Deaths      -0.275361  0.125037  
Sepsis_Deaths          0.034137 -0.287823  
Nature_Deaths         -0.124072  0.033323  
Diabetes_Deaths        0.524986 -0.306077  
Encephalopathy_Deaths  0.027057 -0.265309  
Meningitis_Deaths     -0.045498 -0.004897  
Other_Neonatal_Deaths -0.070486 -0.265279  
Whooping_Cough_Deaths -0.000840  0.023613  
Diarrheal_Deaths      -0.003502  0.130329  
Fire_Heat_Deaths      -0.076153  0.056770  
Road_Deaths            0.066890  0.188621  
Tuberculosis_Deaths    0.300646  0.298881  
HIV_AIDS_Deaths       -0.472547  0.074565  
Drowning_Deaths        0.021078  0.156610  
Malaria_Deaths         0.005307  0.110629  
Syphilis_Deaths        0.234327 -0.102497  
In [30]:
plt.figure(figsize=(14, 10))

for i in range(6): 
    plt.scatter(pca_out[:, i], pca_out[:, i + 1], label=f'PC{i+1} vs PC{i+2}', s=5)

plt.xlabel('Principal Component')
plt.ylabel('Principal Component')
plt.title('Scatter Plot of the First Seven Principal Components')
plt.legend()
plt.show()
No description has been provided for this image

SVD¶

In [31]:
U, s, V = np.linalg.svd(df_scaled, full_matrices=False)
In [32]:
np.round(V.T, 3)
Out[32]:
array([[ 0.094,  0.28 , -0.463,  0.098, -0.311,  0.15 , -0.16 ,  0.169,
        -0.019,  0.069, -0.101, -0.211,  0.041,  0.095,  0.002, -0.224,
        -0.349,  0.434,  0.163,  0.097,  0.001,  0.075,  0.095, -0.098,
        -0.088, -0.045, -0.002, -0.108, -0.046],
       [ 0.192, -0.223, -0.146,  0.056, -0.035,  0.211, -0.074,  0.163,
         0.252,  0.142,  0.175,  0.048,  0.298,  0.036, -0.169, -0.378,
         0.593,  0.092,  0.103, -0.174, -0.034,  0.051, -0.083,  0.103,
         0.058, -0.05 ,  0.057,  0.022,  0.029],
       [ 0.186,  0.099,  0.229, -0.031,  0.091, -0.162,  0.42 ,  0.2  ,
         0.001,  0.167, -0.187, -0.241,  0.592, -0.232, -0.066,  0.116,
        -0.133,  0.08 ,  0.055, -0.127, -0.044, -0.009, -0.094, -0.186,
         0.061,  0.032, -0.039, -0.112, -0.03 ],
       [ 0.181,  0.072,  0.356,  0.059, -0.016,  0.182, -0.073,  0.11 ,
        -0.226,  0.131,  0.092,  0.08 , -0.225,  0.034,  0.019, -0.044,
        -0.038,  0.085,  0.091, -0.237, -0.216,  0.078, -0.303,  0.178,
        -0.4  ,  0.138, -0.28 , -0.367,  0.055],
       [ 0.189, -0.308, -0.059,  0.002,  0.087,  0.085,  0.004, -0.014,
         0.241, -0.111,  0.198, -0.09 ,  0.089,  0.04 ,  0.02 ,  0.39 ,
        -0.174,  0.079, -0.068, -0.093,  0.08 ,  0.037,  0.207,  0.143,
        -0.389, -0.351,  0.316, -0.234,  0.146],
       [ 0.191,  0.195,  0.058, -0.001, -0.004, -0.132,  0.355, -0.324,
         0.327,  0.191,  0.114,  0.356, -0.248,  0.188, -0.295,  0.098,
        -0.022,  0.231,  0.272,  0.053, -0.116,  0.123,  0.016, -0.142,
         0.087, -0.055,  0.052,  0.008, -0.067],
       [ 0.204, -0.155, -0.155,  0.032, -0.092,  0.102, -0.013,  0.087,
         0.031, -0.25 , -0.181,  0.234,  0.07 ,  0.229,  0.338,  0.249,
         0.098, -0.091,  0.028, -0.055, -0.379, -0.051,  0.15 , -0.22 ,
         0.307, -0.024, -0.262, -0.293, -0.095],
       [ 0.208,  0.142,  0.12 ,  0.021, -0.116,  0.004, -0.085,  0.036,
        -0.175, -0.093, -0.353,  0.221,  0.12 ,  0.015,  0.054, -0.016,
        -0.04 , -0.082,  0.05 , -0.092, -0.358,  0.196,  0.003,  0.24 ,
        -0.148, -0.235,  0.327,  0.486, -0.075],
       [ 0.204, -0.061, -0.144, -0.068, -0.049, -0.239, -0.131, -0.175,
         0.109, -0.116, -0.498,  0.007, -0.13 , -0.074, -0.144, -0.133,
         0.078, -0.278,  0.13 , -0.321,  0.344,  0.128,  0.013, -0.2  ,
        -0.228,  0.124,  0.026, -0.201, -0.041],
       [ 0.153, -0.311, -0.171,  0.037, -0.037,  0.07 ,  0.145, -0.568,
        -0.467,  0.356, -0.084, -0.287,  0.042,  0.126,  0.047,  0.018,
         0.094,  0.004,  0.063,  0.055, -0.101, -0.098,  0.05 ,  0.074,
        -0.003,  0.017,  0.019,  0.005, -0.028],
       [ 0.211, -0.182, -0.025,  0.019, -0.016,  0.09 , -0.164,  0.012,
        -0.023, -0.044,  0.002,  0.185,  0.062, -0.152, -0.348,  0.228,
        -0.129,  0.251, -0.128, -0.167,  0.105, -0.303,  0.229,  0.096,
        -0.059,  0.271, -0.333,  0.332, -0.251],
       [ 0.222, -0.003,  0.048,  0.042, -0.054,  0.076,  0.087,  0.051,
         0.051,  0.124, -0.053, -0.044, -0.138,  0.042,  0.152, -0.023,
        -0.017,  0.014, -0.123, -0.111,  0.086,  0.053,  0.145, -0.227,
         0.009, -0.014, -0.242,  0.349,  0.754],
       [ 0.213,  0.004,  0.175,  0.018, -0.027,  0.085, -0.221, -0.011,
        -0.093, -0.045, -0.005,  0.081, -0.052, -0.175, -0.11 , -0.038,
        -0.032,  0.092, -0.024,  0.109, -0.138, -0.254,  0.071, -0.148,
         0.22 ,  0.408,  0.586, -0.214,  0.257],
       [ 0.201, -0.151,  0.138,  0.088, -0.018,  0.275,  0.125,  0.239,
        -0.012,  0.136,  0.209, -0.082, -0.139,  0.169,  0.227, -0.016,
        -0.089, -0.229,  0.069,  0.035,  0.207,  0.148,  0.03 , -0.385,
        -0.106,  0.203,  0.198,  0.237, -0.404],
       [ 0.205,  0.182,  0.101, -0.021,  0.034, -0.034, -0.288,  0.005,
        -0.011,  0.111, -0.103,  0.094,  0.205,  0.031,  0.066,  0.277,
         0.31 , -0.008,  0.16 ,  0.647,  0.23 ,  0.002, -0.026, -0.036,
        -0.236,  0.   , -0.139, -0.012,  0.045],
       [ 0.022, -0.025,  0.033, -0.934, -0.321,  0.124,  0.033,  0.018,
        -0.017,  0.03 ,  0.068, -0.001,  0.003,  0.003,  0.008,  0.007,
         0.001,  0.002,  0.001,  0.002, -0.002, -0.001, -0.003, -0.003,
        -0.004, -0.003, -0.001,  0.004, -0.   ],
       [ 0.151, -0.186, -0.211, -0.15 ,  0.292, -0.525, -0.306,  0.213,
        -0.026,  0.468,  0.123,  0.166, -0.075, -0.054,  0.196, -0.039,
        -0.166,  0.035, -0.055, -0.043, -0.117,  0.058, -0.067, -0.01 ,
         0.078, -0.03 ,  0.023,  0.018, -0.04 ],
       [ 0.206,  0.058,  0.059, -0.01 ,  0.002, -0.027, -0.265, -0.345,
         0.302, -0.19 ,  0.221, -0.387,  0.035, -0.277,  0.139,  0.052,
        -0.053, -0.012,  0.033,  0.009, -0.222,  0.415, -0.057,  0.067,
         0.132,  0.218, -0.137,  0.112, -0.068],
       [ 0.2  ,  0.192, -0.152,  0.054, -0.146,  0.045, -0.005, -0.198,
         0.229,  0.005,  0.087,  0.047,  0.034, -0.07 ,  0.285,  0.027,
        -0.092, -0.12 , -0.05 , -0.115,  0.093, -0.559, -0.542, -0.002,
        -0.03 , -0.122,  0.048,  0.108, -0.036],
       [ 0.186,  0.096,  0.326,  0.006,  0.037,  0.07 , -0.265, -0.06 ,
        -0.176,  0.013,  0.068, -0.127, -0.155, -0.189, -0.11 , -0.122,
         0.044,  0.046, -0.005, -0.086,  0.119, -0.098,  0.149, -0.244,
         0.266, -0.635, -0.081, -0.079, -0.165],
       [ 0.217,  0.096,  0.122,  0.005,  0.003,  0.001,  0.024,  0.063,
        -0.04 ,  0.011, -0.015,  0.01 ,  0.025,  0.24 ,  0.17 ,  0.033,
        -0.133, -0.022,  0.234, -0.121,  0.457,  0.06 ,  0.088,  0.565,
         0.44 ,  0.085,  0.023, -0.074,  0.032],
       [ 0.205,  0.223, -0.033,  0.042, -0.123,  0.004,  0.13 , -0.078,
         0.001,  0.122, -0.045,  0.106, -0.004,  0.025,  0.051, -0.047,
         0.128,  0.072, -0.825,  0.048,  0.118,  0.224,  0.04 ,  0.085,
         0.023,  0.035,  0.047, -0.172, -0.151],
       [ 0.218, -0.06 , -0.051,  0.028, -0.029,  0.076,  0.057,  0.008,
         0.077,  0.033,  0.111,  0.094,  0.125, -0.008, -0.314, -0.326,
        -0.379, -0.577, -0.048,  0.321, -0.139, -0.057,  0.111,  0.132,
        -0.004, -0.066, -0.176, -0.107,  0.072],
       [ 0.182, -0.251, -0.168,  0.015, -0.015, -0.067,  0.189, -0.021,
        -0.383, -0.417,  0.194,  0.287,  0.052, -0.233, -0.011, -0.081,
        -0.078,  0.18 ,  0.046,  0.151,  0.216,  0.267, -0.348, -0.101,
         0.066, -0.072, -0.019,  0.017,  0.088],
       [ 0.206,  0.118,  0.014, -0.038,  0.044, -0.301,  0.299,  0.042,
        -0.028, -0.235,  0.15 , -0.06 , -0.135, -0.184,  0.319, -0.323,
         0.197,  0.092,  0.102,  0.081, -0.07 , -0.275,  0.422,  0.136,
        -0.253,  0.058, -0.034, -0.002, -0.091],
       [ 0.067,  0.195, -0.223, -0.218,  0.743,  0.473,  0.075, -0.07 ,
         0.014, -0.02 , -0.186,  0.068, -0.052, -0.122,  0.071, -0.082,
        -0.038,  0.044,  0.006,  0.004, -0.002,  0.019, -0.   ,  0.015,
        -0.002,  0.012, -0.   ,  0.003, -0.015],
       [ 0.178, -0.319,  0.048, -0.014,  0.006, -0.021,  0.157,  0.274,
         0.205, -0.06 , -0.372, -0.329, -0.427, -0.02 , -0.13 ,  0.023,
         0.043,  0.142, -0.076,  0.311, -0.092, -0.08 , -0.27 ,  0.183,
         0.119, -0.073, -0.012,  0.047, -0.063],
       [ 0.13 ,  0.319, -0.386,  0.032, -0.07 , -0.005,  0.111,  0.258,
        -0.23 ,  0.052,  0.204, -0.155, -0.227, -0.227, -0.245,  0.411,
         0.244, -0.304,  0.081, -0.12 , -0.071,  0.04 ,  0.015,  0.07 ,
         0.074, -0.002,  0.052,  0.005,  0.041],
       [ 0.194,  0.157,  0.002, -0.119,  0.261, -0.234, -0.102,  0.021,
        -0.145, -0.333,  0.155, -0.267,  0.091,  0.637, -0.249, -0.046,
         0.032,  0.022, -0.135, -0.062, -0.063, -0.101, -0.137, -0.122,
        -0.042,  0.048,  0.013,  0.092,  0.029]])
In [33]:
pca_out = pca.transform(df_scaled)
pca_out
Out[33]:
array([[ 1.05030157,  0.32806136, -0.26212573, ..., -0.27747282,
         0.35391138,  0.01280102],
       [ 1.13209969,  0.36087074, -0.26075422, ..., -0.38430763,
         0.31305797, -0.04196272],
       [ 1.3725952 ,  0.42210871, -0.23694544, ..., -0.32019865,
         0.36630812, -0.15413755],
       ...,
       [-0.49005491, -0.1667606 , -0.07166429, ...,  0.36594767,
        -0.11585224,  0.0633935 ],
       [-0.51580557, -0.15763107, -0.09296948, ...,  0.32501016,
        -0.08830259,  0.05705652],
       [-0.52409014, -0.14791434, -0.10220481, ...,  0.25663525,
        -0.09838061,  0.05373432]])
In [34]:
def fit_svd(X, M=1):
    U, s, V = np.linalg.svd(X, full_matrices=False)
    return U[:, :M] @ (np.diag(s[:M]) @ V[:M, :]) 
In [35]:
df_imputed = df_scaled.copy()
In [36]:
row_index = np.random.choice(len(df_imputed), size=20, replace=False)
column_index = np.random.choice(df_imputed.shape[1], size=20)
In [37]:
Xhat = df_imputed.copy()
xbar = np.nanmean(df_imputed, axis=0)
Xhat[row_index, column_index] = xbar[column_index]
In [38]:
thresh = 1e-7
rel_err = 1
iter_ = 0

ismiss = np.isnan(df_imputed)
Xscaled = (df_imputed - xbar) / np.sqrt(np.sum(~ismiss, axis=0))
Xscaled_nomiss = Xscaled[~ismiss]
mssold = np.mean(np.square(Xscaled_nomiss))
mss0 = np.mean(np.square(df_imputed[~ismiss]))
In [39]:
while rel_err > thresh:
    iter_ += 1
    Xapp = fit_svd(Xhat, M=1)
    Xhat[ismiss] = Xapp[ismiss]
    mss = np.mean(np.square(df_imputed[~ismiss] - Xapp[~ismiss]))
    rel_err = (mssold - mss) / mss0
    mssold = mss
    
    print(f"Iter: {iter_}, MSS: {mss}, Rel. Err: {rel_err}")
Iter: 1, MSS: 0.3201319244821912, Rel. Err: -0.31996852578938073
In [40]:
U, s, V = np.linalg.svd(df_scaled, full_matrices=False)

plt.figure(figsize=(10, 6))
plt.plot(range(1, len(s) + 1), s ** 2, marker='o', linestyle='-')
plt.title('Scree Plot')
plt.xlabel('Component Index')
plt.ylabel('Eigenvalue')
plt.grid(True)
plt.show()

cumulative_variance_explained = np.cumsum(s ** 2) / np.sum(s ** 2)
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(s) + 1), cumulative_variance_explained, marker='o', linestyle='-')
plt.title('Cumulative Variance Explained Plot')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Proportion of Variance Explained')
plt.grid(True)
plt.show()
No description has been provided for this image
No description has been provided for this image

Clustering¶

K Means¶

In [41]:
inertia_values = []
k_values = range(2, 30)

for k in k_values:
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(df_scaled)
    inertia_values.append(model.inertia_)
    
plt.figure(figsize=(10, 6))
plt.plot(k_values, inertia_values, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.xticks(k_values)
plt.grid(True)
plt.show()
No description has been provided for this image
In [42]:
kmeans_2 = KMeans(n_clusters=2, random_state=42)
kmeans_2.fit(df_scaled)
labels_2 = kmeans_2.labels_

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df_scaled[:, 0], df_scaled[:, 1], c=labels_2)
ax.set_title("K-Means Clustering Results with K=2")
plt.show()
No description has been provided for this image
In [43]:
kmeans_17 = KMeans(n_clusters=17, random_state=42)
kmeans_17.fit(df_scaled)
labels_17 = kmeans_17.labels_

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df_scaled[:, 0], df_scaled[:, 1], c=labels_17)
ax.set_title("K-Means Clustering Results with K=17")
plt.show()
No description has been provided for this image
In [44]:
kmeans_29 = KMeans(n_clusters=29, random_state=42)
kmeans_29.fit(df_scaled)
labels_29 = kmeans_29.labels_

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df_scaled[:, 0], df_scaled[:, 1], c=labels_29)
ax.set_title("K-Means Clustering Results with K=29")
plt.show()
No description has been provided for this image
In [45]:
kmeans_2_1 = KMeans(n_clusters=2, random_state=3, n_init=1)
kmeans_2_1.fit(df_scaled)
inertia_2_1 = kmeans_2_1.inertia_

kmeans_2_20 = KMeans(n_clusters=2, random_state=3, n_init=20)
kmeans_2_20.fit(df_scaled)
inertia_2_20 = kmeans_2_20.inertia_

print("Inertia for KMeans with 2 clusters and n_init=1:", inertia_2_1)
print("Inertia for KMeans with 2 clusters and n_init=20:", inertia_2_20)

kmeans_17_1 = KMeans(n_clusters=17, random_state=3, n_init=1)
kmeans_17_1.fit(df_scaled)
inertia_17_1 = kmeans_17_1.inertia_

kmeans_17_20 = KMeans(n_clusters=17, random_state=3, n_init=20)
kmeans_17_20.fit(df_scaled)
inertia_17_20 = kmeans_17_20.inertia_

print("Inertia for KMeans with 17 clusters and n_init=1:", inertia_17_1)
print("Inertia for KMeans with 17 clusters and n_init=20:", inertia_17_20)

kmeans_29_1 = KMeans(n_clusters=29, random_state=3, n_init=1)
kmeans_29_1.fit(df_scaled)
inertia_29_1 = kmeans_29_1.inertia_

kmeans_29_20 = KMeans(n_clusters=29, random_state=3, n_init=20)
kmeans_29_20.fit(df_scaled)
inertia_29_20 = kmeans_29_20.inertia_

print("Inertia for KMeans with 29 clusters and n_init=1:", inertia_29_1)
print("Inertia for KMeans with 29 clusters and n_init=20:", inertia_29_20)
Inertia for KMeans with 2 clusters and n_init=1: 92144.82758027197
Inertia for KMeans with 2 clusters and n_init=20: 92144.82758027197
Inertia for KMeans with 17 clusters and n_init=1: 15774.632372538665
Inertia for KMeans with 17 clusters and n_init=20: 14187.884503011679
Inertia for KMeans with 29 clusters and n_init=1: 7970.138918026145
Inertia for KMeans with 29 clusters and n_init=20: 7837.581214293692

Hierarchical Clustering¶

In [46]:
numeric_df = df.drop(columns=['Entity', 'Code', 'Year'])
In [47]:
linkage_methods = ['single', 'complete', 'average']

def plot_dendrograms_sklearn(data, axes, title):
    for method, ax in zip(linkage_methods, axes.flatten()[:-1]):  
        hc = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage=method)
        hc.fit(data)
        Z = linkage(data, method=method)
        dendrogram(Z, ax=ax, truncate_mode='level', p=5, color_threshold=5, above_threshold_color='black')
        ax.set_title(f'Linkage Method: {method.capitalize()}')
    plt.suptitle(title, fontsize=20)
    plt.tight_layout(rect=[0, 0, 1, 0.96])

def plot_dendrogram_centroid(data, ax):
    Z = linkage(data, method='centroid')
    dendrogram(Z, ax=ax, truncate_mode='level', p=5, color_threshold=5, above_threshold_color='black')
    ax.set_title('Linkage Method: Centroid')

fig, axes = plt.subplots(2, 2, figsize=(20, 15))
plot_dendrograms_sklearn(numeric_df, axes, "Hierarchical Clustering with Original Data")
plot_dendrogram_centroid(numeric_df, axes.flatten()[-1])
plt.show()

fig, axes = plt.subplots(2, 2, figsize=(20, 15))
plot_dendrograms_sklearn(df_scaled, axes, "Hierarchical Clustering with Scaled Data")
plot_dendrogram_centroid(df_scaled, axes.flatten()[-1])
plt.show()
No description has been provided for this image
No description has been provided for this image
In [48]:
def cut_and_print_clusters(data, Z, n_clusters, entity_column='Entity'):
    clusters = cut_tree(Z, n_clusters=n_clusters).flatten()
    data['Cluster'] = clusters
    for cluster in range(n_clusters):
        print(f"\nCluster {cluster + 1}:")
        print(data[data['Cluster'] == cluster][['Cluster', entity_column]])

print("Original Data Clusters:")
Z_complete_original = linkage(numeric_df, method='complete')
cut_and_print_clusters(df.copy(), Z_complete_original, n_clusters=5
                      )

print("\nScaled Data Clusters:")
Z_complete_scaled = linkage(df_scaled, method='complete')
scaled_df_with_clusters = pd.DataFrame(df_scaled, columns=numeric_df.columns)
scaled_df_with_clusters['Entity'] = df['Entity'].values  
cut_and_print_clusters(scaled_df_with_clusters, Z_complete_scaled, n_clusters=5, entity_column='Entity')
Original Data Clusters:

Cluster 1:
      Cluster       Entity
0           0  Afghanistan
1           0  Afghanistan
2           0  Afghanistan
3           0  Afghanistan
4           0  Afghanistan
...       ...          ...
6835        0     Zimbabwe
6836        0     Zimbabwe
6837        0     Zimbabwe
6838        0     Zimbabwe
6839        0     Zimbabwe

[6050 rows x 2 columns]

Cluster 2:
      Cluster Entity
1140        1  China
1141        1  China
1142        1  China
1143        1  China
1144        1  China
1145        1  China
1146        1  China
1147        1  China
1148        1  China
1149        1  China
2692        1  India
2693        1  India
2694        1  India
2695        1  India
2696        1  India
2697        1  India
2698        1  India
2699        1  India

Cluster 3:
      Cluster Entity
2670        2  India
2671        2  India
2672        2  India
2673        2  India
2674        2  India
2675        2  India
2676        2  India
2677        2  India
2678        2  India
2679        2  India
2680        2  India
2681        2  India
2682        2  India

Cluster 4:
      Cluster Entity
2683        3  India
2684        3  India
2685        3  India
2686        3  India
2687        3  India
2688        3  India
2689        3  India
2690        3  India
2691        3  India

Cluster 5:
      Cluster   Entity
4170        4  Nigeria
4171        4  Nigeria
4172        4  Nigeria
4173        4  Nigeria
4174        4  Nigeria
4175        4  Nigeria
4176        4  Nigeria
4177        4  Nigeria
4178        4  Nigeria
4179        4  Nigeria
4180        4  Nigeria
4181        4  Nigeria
4182        4  Nigeria
4183        4  Nigeria
4184        4  Nigeria
4185        4  Nigeria
4186        4  Nigeria
4187        4  Nigeria
4188        4  Nigeria
4189        4  Nigeria
4190        4  Nigeria
4191        4  Nigeria
4192        4  Nigeria
4193        4  Nigeria
4194        4  Nigeria
4195        4  Nigeria
4196        4  Nigeria
4197        4  Nigeria
4198        4  Nigeria
4199        4  Nigeria

Scaled Data Clusters:

Cluster 1:
      Cluster       Entity
0           0  Afghanistan
1           0  Afghanistan
2           0  Afghanistan
3           0  Afghanistan
4           0  Afghanistan
...       ...          ...
6115        0     Zimbabwe
6116        0     Zimbabwe
6117        0     Zimbabwe
6118        0     Zimbabwe
6119        0     Zimbabwe

[6082 rows x 2 columns]

Cluster 2:
      Cluster      Entity
421         1  Bangladesh
2360        1       Haiti
2504        1   Indonesia
2520        1        Iran
3678        1     Myanmar
4125        1    Pakistan

Cluster 3:
      Cluster Entity
1110        2  China
1111        2  China
1112        2  China
1113        2  China
1114        2  China
1115        2  China
1116        2  China
1117        2  China
1118        2  China
1119        2  China
1120        2  China

Cluster 4:
      Cluster Entity
2460        3  India
2461        3  India
2462        3  India
2463        3  India
2464        3  India
2465        3  India
2466        3  India
2467        3  India
2468        3  India
2469        3  India

Cluster 5:
      Cluster Entity
2470        4  India
2471        4  India
2472        4  India
2473        4  India
2474        4  India
2475        4  India
2476        4  India
2477        4  India
2478        4  India
2479        4  India
2480        4  India
In [ ]: